library(knitr)
knitr::opts_chunk$set(cache = TRUE, warning = FALSE, 
                      message = FALSE, cache.lazy = FALSE, cache.path = "../../gitignore/RmdCache/", # keep heavy data in gitignore cache
                      fig.path = "Rfigures/Rmd/")
machine="mythinkpad" # define the machine we work on
loadALL = FALSE # only load CpG shared by half fish per trt group
loadannot = TRUE # load genome annotations
sourceDMS = TRUE # Load the calculated DMS
sourceSubUnite = FALSE
source("R02.3_DATALOAD.R")

1 Bottom up analysis: Differential Methylation in offspring

1.1 Differential methylation in all G2 (brother pair and sex as covariate)

1.1.1 TREATMENT EFFECT: Differential methylation between control and infected, within each parental treatment

## Features Annotation (use package genomation v1.24.0)
## NB Promoters are defined by options at genomation::readTranscriptFeatures function. 
## The default option is to take -1000,+1000bp around the TSS and you can change that. 
## -> following Heckwolf 2020 and Sagonas 2020, we consider 1500bp upstream and 500 bp downstream

## Parents comparison:
diffAnn_PAR = annotateWithGeneParts(as(DMSlist$DMS_15pc_G1_C_T,"GRanges"),annotBed12)
## Offspring from control parents comparison: 
diffAnn_G2_controlG1 = annotateWithGeneParts(as(DMSlist$DMS_15pc_CC_CT,"GRanges"),annotBed12)
## Offspring from infected parents comparison:
diffAnn_G2_infectedG1 = annotateWithGeneParts(as(DMSlist$DMS_15pc_TC_TT,"GRanges"),annotBed12)
par(mfrow=c(1,3))
par(mar = c(.1,0.1,5,0.1)) # Set the margin on all sides to 2
## Parents comparison:
diffAnn_PAR = annotateWithGeneParts(as(DMSlist$DMS_15pc_G1_C_T,"GRanges"),annotBed12)
diffAnn_PAR
##   promoter       exon     intron intergenic 
##       8.58      15.84      34.13      45.72 
##   promoter       exon     intron intergenic 
##       8.58      13.19      32.51      45.72 
## promoter     exon   intron 
##     1.07     0.19     0.44 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       3    3020    8128   15141   19226  300247
genomation::plotTargetAnnotation(diffAnn_PAR,precedence=TRUE, main="DMS G1", cex.legend = 1, border="white")

## Offspring from control parents comparison:
diffAnn_G2_controlG1 = annotateWithGeneParts(as(DMSlist$DMS_15pc_CC_CT,"GRanges"),annotBed12)
diffAnn_G2_controlG1
##   promoter       exon     intron intergenic 
##      11.28      21.05      30.16      44.44 
##   promoter       exon     intron intergenic 
##      11.28      16.21      28.07      44.44 
## promoter     exon   intron 
##     0.38     0.07     0.14 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      11    2602    6295   14212   15906  261017
genomation::plotTargetAnnotation(diffAnn_G2_controlG1,precedence=TRUE, main="DMS G2-G1c", cex.legend = 1, border="white")
## Offspring from infected parents comparison:
diffAnn_G2_infectedG1 = annotateWithGeneParts(as(DMSlist$DMS_15pc_TC_TT,"GRanges"),annotBed12)
diffAnn_G2_infectedG1
##   promoter       exon     intron intergenic 
##      12.90      13.62      34.64      46.81 
##   promoter       exon     intron intergenic 
##      12.90       9.42      30.87      46.81 
## promoter     exon   intron 
##     0.28     0.03     0.08 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       9    2355    7150   12285   14773  109527
genomation::plotTargetAnnotation(diffAnn_G2_infectedG1,precedence=TRUE, main="DMS G2-G1i", cex.legend = 1, border="white")

par(mfrow=c(1,1))
## Run the function to get DMS info
DMS_info_G1 <- myDMSinfo(DMSlist$DMS_15pc_G1_C_T, uniteCov6_G1_woSexAndUnknowChrOVERLAP)
DMS_info_G2_G1c_final <- myDMSinfo(DMSlist$DMS_15pc_CC_CT, uniteCov14_G2_woSexAndUnknowChrOVERLAP)
DMS_info_G2_G1i_final <- myDMSinfo(DMSlist$DMS_15pc_TC_TT,uniteCov14_G2_woSexAndUnknowChrOVERLAP)

NB Kostas’ results: “We found a total of 1,973 CpG sites out of 1,172,887 CpGs (0.17%) across the genome that showed at least 15% differential fractional methylation (differentially methylated site [DMS]; q < 0.01) between infected and uninfected fish”

Here: we obtain out a total of 1001880 CpG sites: 3648 (0.36%) showed at least 15% differential fractional methylation (differentially methylated site [DMS]; q < 0.01) between infected and uninfected fish in the parental group; 1197 (0.12%) in the offspring from control parents comparison; 690 (0.07%) in the offspring from infected parents comparison.

1.1.1.1 Intersections of DMS: Venn diagrams

## Chi2 test: are the number of DMS from G2-G1C and G2-G1I overlapping with DMSpar statistically different?

A=length(intersect(DMS_info_G1$DMS,DMS_info_G2_G1c_final$DMS))
B=length(DMS_info_G2_G1c_final$DMS)
C=length(intersect(DMS_info_G1$DMS,DMS_info_G2_G1i_final$DMS))
D=length(DMS_info_G2_G1i_final$DMS)

Observed=matrix(c(A, B-A, C, D-C),nrow=2)
Observed
##      [,1] [,2]
## [1,]  106   59
## [2,] 1091  631
chisq.test(Observed)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  Observed
## X-squared = 0.019909, df = 1, p-value = 0.8878
## not statistically different
## output Venn diagrams
allVenn <- ggVennDiagram(list("DMS G1" = DMS_info_G1$DMS, "DMS G2-c" = DMS_info_G2_G1c_final$DMS, "DMS G2-i" = DMS_info_G2_G1i_final$DMS), label_alpha = 0) +
  scale_fill_gradient(low="white",high = "red")
hypoVenn <- ggVennDiagram(list("DMS G1\nhypo" = DMS_info_G1$DMS[DMS_info_G1$direction %in% "hypo"],
                               "DMS G2-c\nhypo" = DMS_info_G2_G1c_final$DMS[DMS_info_G2_G1c_final$direction %in% "hypo"],
                               "DMS G2-i\nhypo" = DMS_info_G2_G1i_final$DMS[DMS_info_G2_G1i_final$direction %in% "hypo"]), label_alpha = 0) +
  scale_fill_gradient(low="white",high = "red")
hyperVenn <- ggVennDiagram(list("DMS G1\nhyper" = DMS_info_G1$DMS[DMS_info_G1$direction %in% "hyper"],
                                "DMS G2-c\nhyper" = DMS_info_G2_G1c_final$DMS[DMS_info_G2_G1c_final$direction %in% "hyper"],
                                "DMS G2-i\nhyper" = DMS_info_G2_G1i_final$DMS[DMS_info_G2_G1i_final$direction %in% "hyper"]), label_alpha = 0) +
  scale_fill_gradient(low="white",high = "red")

ggarrange(allVenn,
          ggarrange(hypoVenn, hyperVenn, ncol = 2, legend = "none"),
          nrow = 2, widths = c(.5,1))

1.1.1.1.1 Venn with annotated features
runHyperHypoAnnot <- function(){
  par(mfrow=c(2,3))
  par(mar = c(.1,0.1,5,0.1)) # Set the margin on all sides to 2
  ####### HYPO
  ## Parents comparison:
  A = annotateWithGeneParts(
    as(DMSlist$DMS_15pc_G1_C_T[DMS_info_G1$direction %in% "hypo",],"GRanges"),annotBed12)
  genomation::plotTargetAnnotation(A,precedence=TRUE, main="DMS G1\nhypo", 
                                   cex.legend = .4, border="white")
  ## Offspring from control parents comparison:
  B = annotateWithGeneParts(
    as(DMSlist$DMS_15pc_CC_CT[DMS_info_G2_G1c_final$direction %in% "hypo",],"GRanges"),annotBed12)
  genomation::plotTargetAnnotation(B,precedence=TRUE, main="DMS G2-G1c\nhypo", 
                                   cex.legend = .4, border="white")
  ## Offspring from infected parents comparison:
  C = annotateWithGeneParts(
    as(DMSlist$DMS_15pc_TC_TT[DMS_info_G2_G1i_final$direction %in% "hypo",],"GRanges"),annotBed12)
  genomation::plotTargetAnnotation(C,precedence=TRUE, main="DMS G2-G1i\nhypo", 
                                   cex.legend = .4, border="white")
  ####### HYPER
  ## Parents comparison:
  D = annotateWithGeneParts(
    as(DMSlist$DMS_15pc_G1_C_T[DMS_info_G1$direction %in% "hyper",],"GRanges"),annotBed12)
  genomation::plotTargetAnnotation(D,precedence=TRUE, main="DMS G1\nhyper", 
                                   cex.legend = .4, border="white")
  ## Offspring from control parents comparison:
  E = annotateWithGeneParts(
    as(DMSlist$DMS_15pc_CC_CT[DMS_info_G2_G1c_final$direction %in% "hyper",],"GRanges"),annotBed12)
  genomation::plotTargetAnnotation(E,precedence=TRUE, main="DMS G2-G1c\nhyper", 
                                   cex.legend = .4, border="white")
  ## Offspring from infected parents comparison:
  f = annotateWithGeneParts(
    as(DMSlist$DMS_15pc_TC_TT[DMS_info_G2_G1i_final$direction %in% "hyper",],"GRanges"),annotBed12)
  genomation::plotTargetAnnotation(f,precedence=TRUE, main="DMS G2-G1i\nhyper", 
                                   cex.legend = .4, border="white")
  par(mfrow=c(1,1))
  return(list(G1hypo=A, G2G1chypo=B, G2G1ihypo=C, G1hyper=D, G2G1chyper=E, G2G1ihyper=f))
}

myannot=runHyperHypoAnnot()

############################################################
## Venn diagram of overlapping features by their annotation:
table(rowSums(as.data.frame(myannot$G1hypo@members))) # NB: some positions are labelled with several features!
## 
##   0   1   2   3 
## 559 566  49   2
## as in MBE 2021: "giving precedence to the following order promoters, exons,
## introns, and intergenic regions when features overlapped"

myAnnotateDMS <- function(DMS, annot){
  ## sanity check
  if (nrow(DMS) != nrow(annot)){"STOP error in arguments"}
  DMS$pos <- paste(DMS$chr, DMS$start, DMS$end)
  ## NB as in MBE 2021: "giving precedence to the following order promoters, exons,
  ## introns, and intergenic regions when features overlapped"
  DMS$feature <- NA
  ## 1. promoters
  DMS$feature[which(annot$prom == 1)] = "promoter"
  ## 2. exons
  DMS$feature[which(annot$exon == 1 & annot$prom ==0)] = "exon"
  ## 3. intron
  DMS$feature[which(annot$intro == 1 & annot$exon == 0 & annot$prom ==0)] = "intron"
  ## 4. intergenic regions
  DMS$feature[which(annot$intro == 0 & annot$exon == 0 & annot$prom ==0)] = "intergenic"
  return(DMS)
}

DMSlist$DMS_15pc_G1_C_T = myAnnotateDMS(DMSlist$DMS_15pc_G1_C_T, as.data.frame(diffAnn_PAR@members))
DMSlist$DMS_15pc_G1_C_T_HYPO = myAnnotateDMS(DMSlist$DMS_15pc_G1_C_T[DMS_info_G1$direction %in% "hypo",],
                                             as.data.frame(myannot$G1hypo@members))
DMSlist$DMS_15pc_G1_C_T_HYPER = myAnnotateDMS(DMSlist$DMS_15pc_G1_C_T[DMS_info_G1$direction %in% "hyper",],
                                              as.data.frame(myannot$G1hyper@members))

DMSlist$DMS_15pc_CC_CT = myAnnotateDMS(DMSlist$DMS_15pc_CC_CT, as.data.frame(diffAnn_G2_controlG1@members))
DMSlist$DMS_15pc_CC_CT_HYPO = myAnnotateDMS(DMSlist$DMS_15pc_CC_CT[DMS_info_G2_G1c_final$direction %in% "hypo",],
                                            as.data.frame(myannot$G2G1chypo@members))
DMSlist$DMS_15pc_CC_CT_HYPER = myAnnotateDMS(DMSlist$DMS_15pc_CC_CT[DMS_info_G2_G1c_final$direction %in% "hyper",],
                                             as.data.frame(myannot$G2G1chyper@members))

DMSlist$DMS_15pc_TC_TT = myAnnotateDMS(DMSlist$DMS_15pc_TC_TT, as.data.frame(diffAnn_G2_infectedG1@members))
DMSlist$DMS_15pc_TC_TT_HYPO = myAnnotateDMS(DMSlist$DMS_15pc_TC_TT[DMS_info_G2_G1i_final$direction %in% "hypo",],
                                            as.data.frame(myannot$G2G1ihypo@members))
DMSlist$DMS_15pc_TC_TT_HYPER = myAnnotateDMS(DMSlist$DMS_15pc_TC_TT[DMS_info_G2_G1i_final$direction %in% "hyper",],
                                             as.data.frame(myannot$G2G1ihyper@members))

## Make Venn diagram for each feature
getFeatureDFHYPO <- function(myfeat){
  a = DMSlist$DMS_15pc_G1_C_T_HYPO$pos[DMSlist$DMS_15pc_G1_C_T_HYPO$feature %in% myfeat]
  b = DMSlist$DMS_15pc_CC_CT_HYPO$pos[DMSlist$DMS_15pc_CC_CT_HYPO$feature %in% myfeat]
  c = DMSlist$DMS_15pc_TC_TT_HYPO$pos[DMSlist$DMS_15pc_TC_TT_HYPO$feature %in% myfeat]
  return(list(a=a,b=b,c=c))
}

getFeatureDFHYPER <- function(myfeat){
  a = DMSlist$DMS_15pc_G1_C_T_HYPER$pos[DMSlist$DMS_15pc_G1_C_T_HYPER$feature %in% myfeat]
  b = DMSlist$DMS_15pc_CC_CT_HYPER$pos[DMSlist$DMS_15pc_CC_CT_HYPER$feature %in% myfeat]
  c = DMSlist$DMS_15pc_TC_TT_HYPER$pos[DMSlist$DMS_15pc_TC_TT_HYPER$feature %in% myfeat]
  return(list(a=a,b=b,c=c))
}

getVenn <- function(feat, direction){
  if (direction == "hypo"){
    ggVennDiagram(list(A = getFeatureDFHYPO(feat)[["a"]],
                       B = getFeatureDFHYPO(feat)[["b"]],
                       C = getFeatureDFHYPO(feat)[["c"]]), label_alpha = 0,
                  category.names = c(paste0("DMS G1\nhypo\n", feat), paste0("DMS G2-c\nhypo\n", feat), paste0("DMS G2-i\nhypo\n", feat))) +
      scale_fill_gradient(low="white",high = "red")
  } else if (direction == "hyper"){
    ggVennDiagram(list(A = getFeatureDFHYPER(feat)[["a"]],
                       B = getFeatureDFHYPER(feat)[["b"]],
                       C = getFeatureDFHYPER(feat)[["c"]]), label_alpha = 0,
                  category.names = c(paste0("DMS G1\nhyper\n", feat), paste0("DMS G2-c\nhyper\n", feat), paste0("DMS G2-i\nhyper\n", feat))) +
      scale_fill_gradient(low="white",high = "red")
  }
}
ggarrange(getVenn("promoter", "hypo"), getVenn("exon", "hypo"),
          getVenn("intron", "hypo"), getVenn("intergenic", "hypo"),
          nrow = 2, ncol = 2)

ggarrange(getVenn("promoter", "hyper"), getVenn("exon", "hyper"),
          getVenn("intron", "hyper"), getVenn("intergenic", "hyper"),
          nrow = 2, ncol = 2)

1.1.1.2 Manhattan plot of DMS

## Parents trt-ctrl
# load annotation
annot_PAR <- as.data.frame(diffAnn_PAR@members)
makeManhattanPlots(DMSfile = DMSlist$DMS_15pc_G1_C_T, annotFile = annot_PAR, GYgynogff = GYgynogff, 
                   mycols = c("red", "grey", "black", "green"), mytitle = "Manhattan plot of G1 DMS")

## G2-G1c trt-ctrl
# load annotation
annot_G2_G1c <- as.data.frame(diffAnn_G2_controlG1@members)
makeManhattanPlots(DMSfile = DMSlist$DMS_15pc_CC_CT, annotFile = annot_G2_G1c, GYgynogff = GYgynogff, 
                   mycols = c("red", "grey", "black", "green"), mytitle = "Manhattan plot of G2-G1c DMS")

## G2-G1i trt-ctrl
# load annotation
annot_G2_G1i <- as.data.frame(diffAnn_G2_infectedG1@members)
makeManhattanPlots(DMSfile = DMSlist$DMS_15pc_TC_TT, annotFile = annot_G2_G1i, GYgynogff = GYgynogff, 
                   mycols = c("red", "grey", "black", "green"), mytitle = "Manhattan plot of G2-G1i DMS")

## Outliers in Manhattan plot: 15% diff + 2SD
outliers_G1_final <- which(abs(DMSlist$DMS_15pc_G1_C_T$meth.diff) > 15 + 2*sd(abs(DMSlist$DMS_15pc_G1_C_T$meth.diff)))
outliers_annot_G1 <- as.data.frame(diffAnn_PAR@members)[outliers_G1_final,]
makeManhattanPlots(DMSfile = DMSlist$DMS_15pc_G1_C_T[outliers_G1_final, ],
                   annotFile = outliers_annot_G1, GYgynogff = GYgynogff, 
                   mycols = c("red", "grey", "black", "green"), mytitle = "Manhattan plot of G1 DMS")

outliers_G2_G1c_final <- which(abs(DMSlist$DMS_15pc_CC_CT$meth.diff) > 15 + 2*sd(abs(DMSlist$DMS_15pc_CC_CT$meth.diff)))
outliers_annot_G2_G1c <- as.data.frame(diffAnn_G2_controlG1@members)[outliers_G2_G1c_final,]
makeManhattanPlots(DMSfile = DMSlist$DMS_15pc_CC_CT[outliers_G2_G1c_final, ],
                   annotFile = outliers_annot_G2_G1c, GYgynogff = GYgynogff, 
                   mycols = c("red", "grey", "black", "green"), mytitle = "Manhattan plot of G2-G1c DMS")

outliers_G2_G1i_final <- which(abs(DMSlist$DMS_15pc_TC_TT$meth.diff) > 15 + 2*sd(abs(DMSlist$DMS_15pc_TC_TT$meth.diff)))
outliers_annot_G2_G1i <- as.data.frame(diffAnn_G2_infectedG1@members)[outliers_G2_G1i_final,]
makeManhattanPlots(DMSfile = DMSlist$DMS_15pc_TC_TT[outliers_G2_G1i_final, ],
                   annotFile = outliers_annot_G2_G1i, GYgynogff = GYgynogff, 
                   mycols = c("red", "grey", "black", "green"), mytitle = "Manhattan plot of G2-G1i DMS")

1.1.2 PARENTAL EFFECT: Differential methylation between control G2 (G1 infected or not) and between infected G2 (G1 infected of not)

1.2 Differential methylation by brother pair (sex as covariate)

  1. CC-TC = CONTROL fish (parent CvsT)
  2. CT-TT = TREATMENT fish (parent CvsT)
  3. CC-CT = fish from CONTROL parents (G2 CvsT)
  4. TC-TT = fish from TREATMENT parents (G2 CvsT)

TBC To be continued

## All in DMSBPlist
# 
# ## Add fam to bp
# names(DMSBPlist) <- paste0(plyr::join(data.frame(BP = names(DMSBPlist)),
#                                       unique(data.frame(BP = fullMetadata_OFFS$brotherPairID, 
#                                                         fam = fullMetadata_OFFS$Family)))[[2]], "_", names(DMSBPlist))

## Extract DMS (by position)
myPosList = lapply(DMSBPlist, lapply, function(x){paste(x$chr, x$end)})

## Find DMS present in at least 4 BP out of 8 (half):
get2keep = function(Compa){
  x <- lapply(myPosList, function(x){unlist(x[[paste0("DMS_15pc_BP_", Compa)]])})
  f <- table(unlist((x))) # each DMS present between 1 and 8 times
  tokeep <- names(f)[f >= 4]
  print(length(tokeep))
  ## Keep the DMS present in 4 families minimum
  DMSBPlist_INTER4 <- lapply(x, function(x){x[x %in% tokeep]})
  ## Reorder by family:
  DMSBPlist_INTER4 <- DMSBPlist_INTER4[names(DMSBPlist_INTER4)[order(names(DMSBPlist_INTER4))]]
  return(DMSBPlist_INTER4)
}

1.2.1 Attribute plot of these DMS in every BP

## Prepare df for complexUpset
getUpsetDF = function(i){ # for a given comparison
  A = get2keep(vecCompa[i]) 
  A2 = lapply(A, function(x){
    x = data.frame(x)    # vector of DMS as df
    names(x) = "DMS"    # name each CpG
    return(x)
  })
  ## Add BP name
  for (i in 1:length(names(A2))){
    A2[[i]]["BP"] = names(A2)[i]
  }
  # make a dataframe
  A2 = A2 %>% reduce(full_join, by = "DMS")
  # names column with BP id
  for (i in 2:ncol(A2)) {names(A2)[i] = unique(A2[!is.na(A2[i]), i])}
  # replace by 0 or 1 the DMS absence/presence
  A = data.frame(apply(A2[2:9], 2, function(x) ifelse(is.na(x), 0, 1)))
  # add DMS
  A$DMS = A2$DMS
  return(A)
}

## Vector of all 4 comparisons
vecCompa <- c("CC_TC", "CT_TT", "CC_CT", "TC_TT")
## Make upset plots
for (i in 1:4){
  df = getUpsetDF(i)
  print(ComplexUpset::upset(
    df,
    names(df)[1:8],
    width_ratio=0.1,
    sort_intersections_by=c('degree', 'cardinality'),
    queries=query_by_degree(
      df,  names(df)[1:8],
      params_by_degree=list(
        '1'=list(color='red', fill='red'),
        '2'=list(color='purple', fill='purple'),
        '3'=list(color='blue', fill='blue'),
        '4'=list(color='grey', fill='grey'),
        '5'=list(color='red', fill='red'),
        '6'=list(color='purple', fill='purple'),
        '7'=list(color='blue', fill='blue'),
        '8'=list(color='green', fill='green')
      ),
      only_components=c("intersections_matrix", "Intersection size")
    )))
}
## [1] 1099

## [1] 1257

## [1] 329

## [1] 341

1.2.2 Get annotation of these DMS present in at least 4 BP

plotManhattanGenes <- function(i){
  print(paste0("The number of DMS in the ", vecCompa[i] ," comparison is:"))
  DMSvec = unique(unlist(get2keep(vecCompa[i])))
  # Annotate the DMS present in at least 4 BP: 
  # Change the vector into a methobject:
  df <- data.frame(chr=sapply(strsplit(DMSvec, " "), `[`, 1), 
                   start=sapply(strsplit(DMSvec, " "), `[`, 2),
                   end=sapply(strsplit(DMSvec, " "), `[`, 2)) 
  # get annotation
  anot4BP <- getAnnotationFun(makeGRangesFromDataFrame(df))
  print(paste0("The number of genes with DMS in the ", vecCompa[i] ," comparison is:"))
  print(nrow(anot4BP)) # number of genes
  # Reorder chromosome
  anot4BP <- anot4BP %>% 
    mutate(chr = gsub("Gy_chr", "", seqid), chrom_nr = seqid %>% deroman(), chrom_order=factor(chrom_nr) %>% 
             as.numeric()) %>% arrange(chrom_order)  %>% 
    mutate(geneLengthkb = (end - start)/1000, nCpGperGenekb = nCpG/geneLengthkb)
  
  # Plot
  plot = ggplot(anot4BP, aes(x=start, y = nCpGperGenekb)) + geom_point(alpha=.4) +
    facet_grid(.~fct_inorder(chr)) +
    geom_hline(yintercept = 1)+
    theme(axis.text.x=element_blank()) +
    xlab(paste0("Genes with DMS present in at least 4 brother pairs\nComparison: ", vecCompa[i])) +
    ylab("Number of differentially methylated CpG per gene kb")
  
  ## Genes with at least 1 CpG differentially methylated per kb:
  topGenes = anot4BP[anot4BP$nCpGperGenekb >= 1,]
  return(list(plot = plot, anot4BP = anot4BP, topGenes = topGenes))
}

plotManhattanGenes(1)$plot
## [1] "The number of DMS in the CC_TC comparison is:"
## [1] 1099
## [1] "The number of genes with DMS in the CC_TC comparison is:"
## [1] 432

plotManhattanGenes(2)$plot
## [1] "The number of DMS in the CT_TT comparison is:"
## [1] 1257
## [1] "The number of genes with DMS in the CT_TT comparison is:"
## [1] 547

plotManhattanGenes(3)$plot
## [1] "The number of DMS in the CC_CT comparison is:"
## [1] 329
## [1] "The number of genes with DMS in the CC_CT comparison is:"
## [1] 144

plotManhattanGenes(4)$plot
## [1] "The number of DMS in the TC_TT comparison is:"
## [1] 341
## [1] "The number of genes with DMS in the TC_TT comparison is:"
## [1] 171

1.3 Gene Ontology analysis:

  • Gene universe: all genes which are present in the dataset
  • Gene sub-universe: all genes in DMS in the GOterm universe we only want genes with (1) DMS between G2 exposed to parasites, by brother pair, present in AT LEAST 4 BP/8 for which (2) CpGs were covered, and (3) which have a GOterm attributed
# create gene universe
gene_universe <- data.frame(
  subsetByOverlaps(GRanges(annotGff3), GRanges(uniteCov14_G2_woSexAndUnknowChrOVERLAP))) %>% # subselect covered CpGs
  filter(lengths(Ontology_term)!=0) %>% # rm non existing GO terms
  filter(type %in% "gene")  %>% # keep all the 7416 genes with GO terms
  dplyr::select(c("Name", "Ontology_term")) %>%
  mutate(go_linkage_type = "IEA") %>% #NB: IEA but not necessarily true, it's from Interproscan after Maker. Sticklebacks (biomart) have 82701 IEA and 63 ISS.
  relocate("Ontology_term","go_linkage_type","Name") %>%
  unnest(Ontology_term) %>% # one GO per line (was a list before in this column)
  data.frame()

# Create gene set collection
goFrame <- GOFrame(gene_universe, organism="Gasterosteus aculeatus")
goAllFrame <- GOAllFrame(goFrame)
gsc_universe <- GeneSetCollection(goAllFrame, setType = GOCollection())

IMPORTANT NOTE from Mel: why conditional hypergeometric test? The GO ontology is set up as a directed acyclic graph, where a parent term is comprised of all its child terms. If you do a standard hypergeometric, you might e.g., find ‘positive regulation of kinase activity’ to be significant. If you then test ‘positive regulation of catalytic activity’, which is a parent term, then it might be significant as well, but only because of the terms coming from positive regulation of kinase activity.

The conditional hypergeometric takes this into account, and only uses those terms that were not already significant when testing a higher order (parent) term.

## For each comparison:
A=makedfGO(1, gene_universe); B=makedfGO(2, gene_universe); C=makedfGO(3, gene_universe); D=makedfGO(4, gene_universe)
## [1] "The number of DMS in the CC_TC comparison is:"
## [1] 1099
## [1] "The number of genes with DMS in the CC_TC comparison is:"
## [1] 432
## [1] "The number of DMS in the CT_TT comparison is:"
## [1] 1257
## [1] "The number of genes with DMS in the CT_TT comparison is:"
## [1] 547
## [1] "The number of DMS in the CC_CT comparison is:"
## [1] 329
## [1] "The number of genes with DMS in the CC_CT comparison is:"
## [1] 144
## [1] "The number of DMS in the TC_TT comparison is:"
## [1] 341
## [1] "The number of genes with DMS in the TC_TT comparison is:"
## [1] 171
dfGO = rbind(A, B, C, D)

# add type of comparison:
dfGO$group = "Different parental treatment"
dfGO$group[dfGO$Comp %in% c("CC_CT", "TC_TT")] = "Different offpring treatment"

1.3.1 GO plot

dfGO %>% ggplot(aes(x=Comp, y = factor(GO.name))) +
  geom_point(aes(color = p.value.adjusted, size = genePercent)) +
  scale_color_gradient(name="adjusted\np-value", low = "red", high = "blue") +
  scale_size_continuous(name = "% of genes")+
  theme_bw() + ylab("") + xlab("Treatments comparison") +
  theme(legend.box.background = element_rect(fill = "#ebebeb", color = "#ebebeb"), 
        legend.background = element_rect(fill = "#ebebeb", color = "#ebebeb"), 
        legend.key = element_rect(fill = "#ebebeb", color = "#ebebeb"), legend.position="left") + # grey box for legend
  facet_grid(fct_inorder(GO.category)~group, scales="free",space = "free") 

1.3.2 Which genes have at least 1 CpG differentially methylated per kb:

getGeneSummary <- function(i){
  geneNotes = plotManhattanGenes(i)$topGenes$Note
  # extract uniprot symbol from note, then uppercase
  genes = sapply(geneNotes, function(x) sub("Similar to ", "", x) %>% str_extract(".*:") %>% str_remove(":") %>% toupper) 
  # Convert the uniprot gene names to entrez ids
  topGenesDF = unlist(mapIds(org.Hs.eg.db, keys = genes, column = "ENTREZID", keytype = "SYMBOL")) %>% data.frame() 
  names(topGenesDF) = "ENTREZID" ; topGenesDF$GeneSymbol = rownames(topGenesDF) ; rownames(topGenesDF) = NULL
  # Retrieve gene summary & descirption
  genes = entrez_summary(db="gene", id=topGenesDF$ENTREZID)
  SummaDF = lapply(genes, function(x) x[["summary"]]) %>% unlist() %>% data.frame()
  names(SummaDF) = "Summary" ; SummaDF$ENTREZID = rownames(SummaDF) ; rownames(SummaDF) = NULL
  DescDF = lapply(genes, function(x) x[["description"]]) %>% unlist() %>% data.frame()
  names(DescDF) = "description" ; DescDF$ENTREZID = rownames(DescDF) ; rownames(DescDF) = NULL
  # Output complete table
  topGenesDF = merge(merge(DescDF, topGenesDF, all = T), SummaDF, all = T)
  # Add which comparison
  topGenesDF$comparison = vecCompa[i]
  return(topGenesDF)
}

[1] “The number of DMS in the CC_TC comparison is:” [1] 1099 [1] “The number of genes with DMS in the CC_TC comparison is:” [1] 432

ENTREZID description GeneSymbol Summary comparison
10873 malic enzyme 3 ME3 Malic enzyme catalyzes the oxidative decarboxylation of malate to pyruvate using either NAD+ or NADP+ as a cofactor. Mammalian tissues contain 3 distinct isoforms of malic enzyme: a cytosolic NADP(+)-dependent isoform, a mitochondrial NADP(+)-dependent isoform, and a mitochondrial NAD(+)-dependent isoform. This gene encodes a mitochondrial NADP(+)-dependent isoform. Multiple alternatively spliced transcript variants have been found for this gene, but the biological validity of some variants has not been determined. [provided by RefSeq, Jul 2008] CC_TC
10887 prokineticin receptor 1 PROKR1 This gene encodes a member of the G-protein-coupled receptor family. The encoded protein binds to prokineticins (1 and 2), leading to the activation of MAPK and STAT signaling pathways. Prokineticins are protein ligands involved in angiogenesis and inflammation. The encoded protein is expressed in peripheral tissues such as those comprising the circulatory system, lungs, reproductive system, endocrine system and the gastrointestinal system. The protein may be involved in signaling in human fetal ovary during initiation of primordial follicle formation. Sequence variants in this gene may be associated with recurrent miscarriage. [provided by RefSeq, Aug 2016] CC_TC
11135 CDC42 effector protein 1 CDC42EP1 CDC42 is a member of the Rho GTPase family that regulates multiple cellular activities, including actin polymerization. The protein encoded by this gene is a CDC42 binding protein that mediates actin cytoskeleton reorganization at the plasma membrane. This protein is secreted and is primarily found in bone marrow. [provided by RefSeq, Jul 2008] CC_TC
221421 radial spoke head component 9 RSPH9 This gene encodes a protein thought to be a component of the radial spoke head in motile cilia and flagella. Mutations in this gene are associated with primary ciliary dyskinesia 12. Alternative splicing results in multiple transcript variants.[provided by RefSeq, Jul 2010] CC_TC
2287 FKBP prolyl isomerase 3 FKBP3 The protein encoded by this gene is a member of the immunophilin protein family, which play a role in immunoregulation and basic cellular processes involving protein folding and trafficking. This encoded protein is a cis-trans prolyl isomerase that binds the immunosuppressants FK506 and rapamycin, as well as histone deacetylases, the transcription factor YY1, casein kinase II, and nucleolin. It has a higher affinity for rapamycin than for FK506 and thus may be an important target molecule for immunosuppression by rapamycin. [provided by RefSeq, Sep 2008] CC_TC
2831 neuropeptides B and W receptor 1 NPBWR1 CC_TC
3040 hemoglobin subunit alpha 2 HBA2 The human alpha globin gene cluster located on chromosome 16 spans about 30 kb and includes seven loci: 5’- zeta - pseudozeta - mu - pseudoalpha-1 - alpha-2 - alpha-1 - theta - 3’. The alpha-2 (HBA2) and alpha-1 (HBA1) coding sequences are identical. These genes differ slightly over the 5’ untranslated regions and the introns, but they differ significantly over the 3’ untranslated regions. Two alpha chains plus two beta chains constitute HbA, which in normal adult life comprises about 97% of the total hemoglobin; alpha chains combine with delta chains to constitute HbA-2, which with HbF (fetal hemoglobin) makes up the remaining 3% of adult hemoglobin. Alpha thalassemias result from deletions of each of the alpha genes as well as deletions of both HBA2 and HBA1; some nondeletion alpha thalassemias have also been reported. [provided by RefSeq, Jul 2008] CC_TC
3763 potassium inwardly rectifying channel subfamily J member 6 KCNJ6 This gene encodes a member of the G protein-coupled inwardly-rectifying potassium channel family of inward rectifier potassium channels. This type of potassium channel allows a greater flow of potassium into the cell than out of it. These proteins modulate many physiological processes, including heart rate in cardiac cells and circuit activity in neuronal cells, through G-protein coupled receptor stimulation. Mutations in this gene are associated with Keppen-Lubinsky Syndrome, a rare condition characterized by severe developmental delay, facial dysmorphism, and intellectual disability. [provided by RefSeq, Apr 2015] CC_TC
3982 lens intrinsic membrane protein 2 LIM2 This gene encodes an eye lens-specific protein found at the junctions of lens fiber cells, where it may contribute to cell junctional organization. It acts as a receptor for calmodulin, and may play an important role in both lens development and cataractogenesis. Mutations in this gene have been associated with cataract formation. Alternatively spliced transcript variants encoding different isoforms have been found for this gene.[provided by RefSeq, Sep 2009] CC_TC
407738 TAFA chemokine like family member 1 TAFA1 This gene is a member of the TAFA family which is composed of five highly homologous genes that encode small secreted proteins. These proteins contain conserved cysteine residues at fixed positions, and are distantly related to MIP-1alpha, a member of the CC-chemokine family. The TAFA proteins are predominantly expressed in specific regions of the brain, and are postulated to function as brain-specific chemokines or neurokines that act as regulators of immune and nervous cells. [provided by RefSeq, Jul 2008] CC_TC
4099 myelin associated glycoprotein MAG The protein encoded by this gene is a type I membrane protein and member of the immunoglobulin superfamily. It is thought to be involved in the process of myelination. It is a lectin that binds to sialylated glycoconjugates and mediates certain myelin-neuron cell-cell interactions. Three alternatively spliced transcripts encoding different isoforms have been described for this gene. [provided by RefSeq, Nov 2010] CC_TC
440738 microtubule associated protein 1 light chain 3 gamma MAP1LC3C Autophagy is a highly regulated bulk degradation process that plays an important role in cellular maintenance and development. MAP1LC3C is an ortholog of the yeast autophagosome protein Atg8 (He et al., 2003 [PubMed 12740394]).[supplied by OMIM, Nov 2010] CC_TC
51305 potassium two pore domain channel subfamily K member 9 KCNK9 This gene encodes a protein that contains multiple transmembrane regions and two pore-forming P domains and functions as a pH-dependent potassium channel. Amplification and overexpression of this gene have been observed in several types of human carcinomas. This gene is imprinted in the brain, with preferential expression from the maternal allele. A mutation in this gene was associated with Birk-Barel dysmorphism syndrome. Alternative splicing results in multiple transcript variants. [provided by RefSeq, Jul 2017] CC_TC
51647 cytosolic iron-sulfur assembly component 2B CIAO2B CC_TC
55603 terminal nucleotidyltransferase 5A TENT5A CC_TC
64747 major facilitator superfamily domain containing 1 MFSD1 CC_TC
650 bone morphogenetic protein 2 BMP2 This gene encodes a secreted ligand of the TGF-beta (transforming growth factor-beta) superfamily of proteins. Ligands of this family bind various TGF-beta receptors leading to recruitment and activation of SMAD family transcription factors that regulate gene expression. The encoded preproprotein is proteolytically processed to generate each subunit of the disulfide-linked homodimer, which plays a role in bone and cartilage development. Duplication of a regulatory region downstream of this gene causes a form of brachydactyly characterized by a malformed index finger and second toe in human patients. [provided by RefSeq, Jul 2016] CC_TC
7431 vimentin VIM This gene encodes a type III intermediate filament protein. Intermediate filaments, along with microtubules and actin microfilaments, make up the cytoskeleton. The encoded protein is responsible for maintaining cell shape and integrity of the cytoplasm, and stabilizing cytoskeletal interactions. This protein is involved in neuritogenesis and cholesterol transport and functions as an organizer of a number of other critical proteins involved in cell attachment, migration, and signaling. Bacterial and viral pathogens have been shown to attach to this protein on the host cell surface. Mutations in this gene are associated with congenital cataracts in human patients. [provided by RefSeq, Aug 2017] CC_TC
7528 YY1 transcription factor YY1 YY1 is a ubiquitously distributed transcription factor belonging to the GLI-Kruppel class of zinc finger proteins. The protein is involved in repressing and activating a diverse number of promoters. YY1 may direct histone deacetylases and histone acetyltransferases to a promoter in order to activate or repress the promoter, thus implicating histone modification in the function of YY1. [provided by RefSeq, Jul 2008] CC_TC
84163 GTF2I repeat domain containing 2 GTF2IRD2 This gene is one of several closely related genes on chromosome 7 encoding proteins containing helix-loop-helix motifs. These proteins may function as regulators of transcription. The encoded protein is unique in that its C-terminus is derived from CHARLIE8 transposable element sequence. This gene is located in a region of chromosome 7 that is deleted in Williams-Beuren syndrome, and loss of this locus may contribute to the cognitive phenotypes observed in this disease. Alternative splicing results in multiple transcript variants. [provided by RefSeq, Jul 2013] CC_TC
8492 serine protease 12 PRSS12 This gene encodes a member of the trypsin family of serine proteases and contains a signal peptide, a proline-rich region, a Kringle domain, four scavenger receptor cysteine-rich domains, and a trypsin-like serine protease domain. The protein, sometimes referred to as neurotrypsin or motopsin, is secreted from neuronal cells and localizes to the synaptic cleft. Studies in mice show that this protein cleaves a protein, agrin, that is important for the formation and maintenance of exitatory synapses. Defects in this gene cause a form of autosomal recessive cognitive impairment (MRT1). [provided by RefSeq, Jul 2017] CC_TC
8526 diacylglycerol kinase epsilon DGKE Diacylglycerol kinases are thought to be involved mainly in the regeneration of phosphatidylinositol (PI) from diacylglycerol in the PI-cycle during cell signal transduction. When expressed in mammalian cells, DGK-epsilon shows specificity for arachidonyl-containing diacylglycerol. DGK-epsilon is expressed predominantly in testis. [provided by RefSeq, Jul 2008] CC_TC
9403 selenoprotein F SELENOF The protein encoded by this gene belongs to the SEP15/selenoprotein M family. The exact function of this protein is not known; however, it has been found to associate with UDP-glucose:glycoprotein glucosyltransferase (UGTR), an endoplasmic reticulum(ER)-resident protein, which is involved in the quality control of protein folding. The association with UGTR retains this protein in the ER, where it may play a role in protein folding. It has also been suggested to have a role in cancer etiology. This protein is a selenoprotein, containing the rare amino acid selenocysteine (Sec). Sec is encoded by the UGA codon, which normally signals translation termination. The 3’ UTRs of selenoprotein mRNAs contain a conserved stem-loop structure, designated the Sec insertion sequence (SECIS) element, that is necessary for the recognition of UGA as a Sec codon, rather than as a stop signal. Alternatively spliced transcript variants have been found for this gene. [provided by RefSeq, Nov 2016] CC_TC
NA NA RASGEF1BB NA CC_TC

[1] “The number of DMS in the CT_TT comparison is:” [1] 1257 [1] “The number of genes with DMS in the CT_TT comparison is:” [1] 547

ENTREZID description GeneSymbol Summary comparison
114794 extracellular leucine rich repeat and fibronectin type III domain containing 2 ELFN2 CT_TT
140838 N-acetylneuraminic acid phosphatase NANP CT_TT
145864 hyaluronan and proteoglycan link protein 3 HAPLN3 This gene belongs to the hyaluronan and proteoglycan binding link protein gene family. The protein encoded by this gene may function in hyaluronic acid binding and cell adhesion. [provided by RefSeq, Jul 2008] CT_TT
150383 cysteine rich DPF motif domain containing 1 CDPF1 CT_TT
158787 RIB43A domain with coiled-coils 1 RIBC1 CT_TT
170591 S100 calcium binding protein Z S100Z Members of the S100 protein family contain 2 calcium-binding EF-hands and exhibit cell-type specific expression patterns. For additional background information on S100 proteins, see MIM 114085.[supplied by OMIM, Mar 2008] CT_TT
221391 opsin 5 OPN5 Opsins are members of the guanine nucleotide-binding protein (G protein)-coupled receptor superfamily. This opsin gene is expressed in the eye, brain, testes, and spinal cord. This gene belongs to the seven-exon subfamily of mammalian opsin genes that includes peropsin (RRH) and retinal G protein coupled receptor (RGR). Like these other seven-exon opsin genes, this family member may encode a protein with photoisomerase activity. Alternative splicing results in multiple transcript variants. [provided by RefSeq, Jun 2010] CT_TT
2287 FKBP prolyl isomerase 3 FKBP3 The protein encoded by this gene is a member of the immunophilin protein family, which play a role in immunoregulation and basic cellular processes involving protein folding and trafficking. This encoded protein is a cis-trans prolyl isomerase that binds the immunosuppressants FK506 and rapamycin, as well as histone deacetylases, the transcription factor YY1, casein kinase II, and nucleolin. It has a higher affinity for rapamycin than for FK506 and thus may be an important target molecule for immunosuppression by rapamycin. [provided by RefSeq, Sep 2008] CT_TT
2535 frizzled class receptor 2 FZD2 This intronless gene is a member of the frizzled gene family. Members of this family encode seven-transmembrane domain proteins that are receptors for the wingless type MMTV integration site family of signaling proteins. This gene encodes a protein that is coupled to the beta-catenin canonical signaling pathway. Competition between the wingless-type MMTV integration site family, member 3A and wingless-type MMTV integration site family, member 5A gene products for binding of this protein is thought to regulate the beta-catenin-dependent and -independent pathways. [provided by RefSeq, Dec 2010] CT_TT
26164 mitochondrial ribosome associated GTPase 2 MTG2 Small G proteins, such as GTPBP5, act as molecular switches that play crucial roles in the regulation of fundamental cellular processes such as protein synthesis, nuclear transport, membrane trafficking, and signal transduction (Hirano et al., 2006 [PubMed 17054726]).[supplied by OMIM, Mar 2008] CT_TT
26531 olfactory receptor family 11 subfamily A member 1 OR11A1 Olfactory receptors interact with odorant molecules in the nose, to initiate a neuronal response that triggers the perception of a smell. The olfactory receptor proteins are members of a large family of G-protein-coupled receptors (GPCR) arising from single coding-exon genes. Olfactory receptors share a 7-transmembrane domain structure with many neurotransmitter and hormone receptors and are responsible for the recognition and G protein-mediated transduction of odorant signals. The olfactory receptor gene family is the largest in the genome. The nomenclature assigned to the olfactory receptor genes and proteins for this organism is independent of other organisms. [provided by RefSeq, Jul 2008] CT_TT
29094 galectin like LGALSL CT_TT
3040 hemoglobin subunit alpha 2 HBA2 The human alpha globin gene cluster located on chromosome 16 spans about 30 kb and includes seven loci: 5’- zeta - pseudozeta - mu - pseudoalpha-1 - alpha-2 - alpha-1 - theta - 3’. The alpha-2 (HBA2) and alpha-1 (HBA1) coding sequences are identical. These genes differ slightly over the 5’ untranslated regions and the introns, but they differ significantly over the 3’ untranslated regions. Two alpha chains plus two beta chains constitute HbA, which in normal adult life comprises about 97% of the total hemoglobin; alpha chains combine with delta chains to constitute HbA-2, which with HbF (fetal hemoglobin) makes up the remaining 3% of adult hemoglobin. Alpha thalassemias result from deletions of each of the alpha genes as well as deletions of both HBA2 and HBA1; some nondeletion alpha thalassemias have also been reported. [provided by RefSeq, Jul 2008] CT_TT
4056 leukotriene C4 synthase LTC4S The MAPEG (Membrane Associated Proteins in Eicosanoid and Glutathione metabolism) family includes a number of human proteins, several of which are involved the production of leukotrienes. This gene encodes an enzyme that catalyzes the first step in the biosynthesis of cysteinyl leukotrienes, potent biological compounds derived from arachidonic acid. Leukotrienes have been implicated as mediators of anaphylaxis and inflammatory conditions such as human bronchial asthma. This protein localizes to the nuclear envelope and adjacent endoplasmic reticulum. [provided by RefSeq, Jul 2008] CT_TT
4160 melanocortin 4 receptor MC4R The protein encoded by this gene is a membrane-bound receptor and member of the melanocortin receptor family. The encoded protein interacts with adrenocorticotropic and MSH hormones and is mediated by G proteins. This is an intronless gene. Defects in this gene are a cause of autosomal dominant obesity. [provided by RefSeq, Jan 2010] CT_TT
440738 microtubule associated protein 1 light chain 3 gamma MAP1LC3C Autophagy is a highly regulated bulk degradation process that plays an important role in cellular maintenance and development. MAP1LC3C is an ortholog of the yeast autophagosome protein Atg8 (He et al., 2003 [PubMed 12740394]).[supplied by OMIM, Nov 2010] CT_TT
4736 ribosomal protein L10a RPL10A Ribosomes, the organelles that catalyze protein synthesis, consist of a small 40S subunit and a large 60S subunit. Together these subunits are composed of 4 RNA species and approximately 80 structurally distinct proteins. This gene encodes a ribosomal protein that is a component of the 60S subunit. The protein belongs to the L1P family of ribosomal proteins. It is located in the cytoplasm. The expression of this gene is downregulated in the thymus by cyclosporin-A (CsA), an immunosuppressive drug. Studies in mice have shown that the expression of the ribosomal protein L10a gene is downregulated in neural precursor cells during development. This gene previously was referred to as NEDD6 (neural precursor cell expressed, developmentally downregulated 6), but it has been renamed RPL10A (ribosomal protein 10a). As is typical for genes encoding ribosomal proteins, there are multiple processed pseudogenes of this gene dispersed through the genome. [provided by RefSeq, Jul 2008] CT_TT
51647 cytosolic iron-sulfur assembly component 2B CIAO2B CT_TT
5251 phosphate regulating endopeptidase homolog X-linked PHEX The protein encoded by this gene is a transmembrane endopeptidase that belongs to the type II integral membrane zinc-dependent endopeptidase family. The protein is thought to be involved in bone and dentin mineralization and renal phosphate reabsorption. Mutations in this gene cause X-linked hypophosphatemic rickets. Alternative splicing results in multiple transcript variants. [provided by RefSeq, Sep 2013] CT_TT
55584 cholinergic receptor nicotinic alpha 9 subunit CHRNA9 This gene is a member of the ligand-gated ionic channel family and nicotinic acetylcholine receptor gene superfamily. It encodes a plasma membrane protein that forms homo- or hetero-oligomeric divalent cation channels. This protein is involved in cochlea hair cell development and is also expressed in the outer hair cells (OHCs) of the adult cochlea. [provided by RefSeq, Feb 2012] CT_TT
5603 mitogen-activated protein kinase 13 MAPK13 This gene encodes a member of the mitogen-activated protein (MAP) kinase family. MAP kinases act as an integration point for multiple biochemical signals, and are involved in a wide variety of cellular processes such as proliferation, differentiation, transcription regulation and development. The encoded protein is a p38 MAP kinase and is activated by proinflammatory cytokines and cellular stress. Substrates of the encoded protein include the transcription factor ATF2 and the microtubule dynamics regulator stathmin. Alternatively spliced transcript variants have been observed for this gene. [provided by RefSeq, Jul 2012] CT_TT
83641 family with sequence similarity 107 member B FAM107B CT_TT
85460 zinc finger protein 518B ZNF518B CT_TT
85462 FH2 domain containing 1 FHDC1 CT_TT
9775 eukaryotic translation initiation factor 4A3 EIF4A3 This gene encodes a member of the DEAD box protein family. DEAD box proteins, characterized by the conserved motif Asp-Glu-Ala-Asp (DEAD), are putative RNA helicases. They are implicated in a number of cellular processes involving alteration of RNA secondary structure, such as translation initiation, nuclear and mitochondrial splicing, and ribosome and spliceosome assembly. Based on their distribution patterns, some members of this family are believed to be involved in embryogenesis, spermatogenesis, and cellular growth and division. The protein encoded by this gene is a nuclear matrix protein. Its amino acid sequence is highly similar to the amino acid sequences of the translation initiation factors eIF4AI and eIF4AII, two other members of the DEAD box protein family. [provided by RefSeq, Jul 2008] CT_TT
NA NA MNCB-2990 NA CT_TT
NA NA ACBD5A NA CT_TT
NA NA HOXB9A NA CT_TT

[1] “The number of DMS in the CC_CT comparison is:” [1] 329 [1] “The number of genes with DMS in the CC_CT comparison is:” [1] 144

ENTREZID description GeneSymbol Summary comparison
10626 tripartite motif containing 16 TRIM16 The protein encoded by this gene is a tripartite motif (TRIM) family member that contains two B box domains and a coiled-coiled region that are characteristic of the B box zinc finger protein family. While it lacks a RING domain found in other TRIM proteins, the encoded protein can homodimerize or heterodimerize with other TRIM proteins and has E3 ubiquitin ligase activity. This gene is also a tumor suppressor and is involved in secretory autophagy. [provided by RefSeq, Jan 2017] CC_CT
1113 chromogranin A CHGA The protein encoded by this gene is a member of the chromogranin/secretogranin family of neuroendocrine secretory proteins. It is found in secretory vesicles of neurons and endocrine cells. This gene product is a precursor to three biologically active peptides; vasostatin, pancreastatin, and parastatin. These peptides act as autocrine or paracrine negative modulators of the neuroendocrine system. Two other peptides, catestatin and chromofungin, have antimicrobial activity and antifungal activity, respectively. Two transcript variants encoding different isoforms have been found for this gene. [provided by RefSeq, Sep 2014] CC_CT
150383 cysteine rich DPF motif domain containing 1 CDPF1 CC_CT
221078 NOP2/Sun RNA methyltransferase 6 NSUN6 CC_CT
26531 olfactory receptor family 11 subfamily A member 1 OR11A1 Olfactory receptors interact with odorant molecules in the nose, to initiate a neuronal response that triggers the perception of a smell. The olfactory receptor proteins are members of a large family of G-protein-coupled receptors (GPCR) arising from single coding-exon genes. Olfactory receptors share a 7-transmembrane domain structure with many neurotransmitter and hormone receptors and are responsible for the recognition and G protein-mediated transduction of odorant signals. The olfactory receptor gene family is the largest in the genome. The nomenclature assigned to the olfactory receptor genes and proteins for this organism is independent of other organisms. [provided by RefSeq, Jul 2008] CC_CT
3040 hemoglobin subunit alpha 2 HBA2 The human alpha globin gene cluster located on chromosome 16 spans about 30 kb and includes seven loci: 5’- zeta - pseudozeta - mu - pseudoalpha-1 - alpha-2 - alpha-1 - theta - 3’. The alpha-2 (HBA2) and alpha-1 (HBA1) coding sequences are identical. These genes differ slightly over the 5’ untranslated regions and the introns, but they differ significantly over the 3’ untranslated regions. Two alpha chains plus two beta chains constitute HbA, which in normal adult life comprises about 97% of the total hemoglobin; alpha chains combine with delta chains to constitute HbA-2, which with HbF (fetal hemoglobin) makes up the remaining 3% of adult hemoglobin. Alpha thalassemias result from deletions of each of the alpha genes as well as deletions of both HBA2 and HBA1; some nondeletion alpha thalassemias have also been reported. [provided by RefSeq, Jul 2008] CC_CT
440738 microtubule associated protein 1 light chain 3 gamma MAP1LC3C Autophagy is a highly regulated bulk degradation process that plays an important role in cellular maintenance and development. MAP1LC3C is an ortholog of the yeast autophagosome protein Atg8 (He et al., 2003 [PubMed 12740394]).[supplied by OMIM, Nov 2010] CC_CT
51647 cytosolic iron-sulfur assembly component 2B CIAO2B CC_CT
55040 epsin 3 EPN3 CC_CT
84513 phospholipid phosphatase 5 PLPP5 CC_CT

[1] “The number of DMS in the TC_TT comparison is:” [1] 341 [1] “The number of genes with DMS in the TC_TT comparison is:” [1] 171

ENTREZID description GeneSymbol Summary comparison
26001 ring finger protein 167 RNF167 RNF167 is an E3 ubiquitin ligase that interacts with TSSC5 (SLC22A18; MIM 602631) and, together with UBCH6 (UBE2E1; MIM 602916), facilitates TSSC5 polyubiquitylation (Yamada and Gorbsky, 2006 [PubMed 16314844]).[supplied by OMIM, Mar 2008] TC_TT
3982 lens intrinsic membrane protein 2 LIM2 This gene encodes an eye lens-specific protein found at the junctions of lens fiber cells, where it may contribute to cell junctional organization. It acts as a receptor for calmodulin, and may play an important role in both lens development and cataractogenesis. Mutations in this gene have been associated with cataract formation. Alternatively spliced transcript variants encoding different isoforms have been found for this gene.[provided by RefSeq, Sep 2009] TC_TT
4247 alpha-1,6-mannosyl-glycoprotein 2-beta-N-acetylglucosaminyltransferase MGAT2 The product of this gene is a Golgi enzyme catalyzing an essential step in the conversion of oligomannose to complex N-glycans. The enzyme has the typical glycosyltransferase domains: a short N-terminal cytoplasmic domain, a hydrophobic non-cleavable signal-anchor domain, and a C-terminal catalytic domain. Mutations in this gene may lead to carbohydrate-deficient glycoprotein syndrome, type II. The coding region of this gene is intronless. Transcript variants with a spliced 5’ UTR may exist, but their biological validity has not been determined. [provided by RefSeq, Jul 2008] TC_TT
440738 microtubule associated protein 1 light chain 3 gamma MAP1LC3C Autophagy is a highly regulated bulk degradation process that plays an important role in cellular maintenance and development. MAP1LC3C is an ortholog of the yeast autophagosome protein Atg8 (He et al., 2003 [PubMed 12740394]).[supplied by OMIM, Nov 2010] TC_TT
51058 zinc finger protein 691 ZNF691 TC_TT
6158 ribosomal protein L28 RPL28 Ribosomes, the organelles that catalyze protein synthesis, consist of a small 40S subunit and a large 60S subunit. Together these subunits are composed of 4 RNA species and approximately 80 structurally distinct proteins. This gene encodes a ribosomal protein that is a component of the 60S subunit. The protein belongs to the L28E family of ribosomal proteins. It is located in the cytoplasm. Variable expression of this gene in colorectal cancers compared to adjacent normal tissues has been observed, although no correlation between the level of expression and the severity of the disease has been found. As is typical for genes encoding ribosomal proteins, there are multiple processed pseudogenes of this gene dispersed through the genome. Alternative splicing results in multiple transcript variants encoding distinct isoforms.[provided by RefSeq, Oct 2008] TC_TT
64866 CUB domain containing protein 1 CDCP1 This gene encodes a transmembrane protein which contains three extracellular CUB domains and acts as a substrate for Src family kinases. The protein plays a role in the tyrosine phosphorylation-dependent regulation of cellular events that are involved in tumor invasion and metastasis. Alternative splicing results in multiple transcript variants of this gene. [provided by RefSeq, May 2013] TC_TT
85460 zinc finger protein 518B ZNF518B TC_TT

2 Top down analysis: Differential Methylation in parents -> how do they look in offspring?

##############################################################
#### I. Focus on CpG positions at parental (Ctrl-Inf) DMS ####
##############################################################

####################################################################################
#### Question: how are the beta values in the different groups at the parental DMS?##
####################################################################################
##############
## Prepare dataset
##############
PM_G1 <- getPMdataset(uniteCov = uniteCov6_G1_woSexAndUnknowChrOVERLAP, MD = fullMetadata_PAR, gener="parents")
PM_G2 <- getPMdataset(uniteCov = uniteCov14_G2_woSexAndUnknowChrOVERLAP, MD = fullMetadata_OFFS, gener="offspring")

head(PM_G1)
head(PM_G2)

table(fullMetadata_OFFS$trtG1G2, fullMetadata_OFFS$clutch.ID)

## What is the relative contribution of methylation to brother pair & paternal treatment?
## Test of VCA: variance component analysis https://cran.r-project.org/web/packages/VCA/vignettes/VCA_package_vignette.html

## Hypo
PM_G2_mean_hypo <- PM_G2[PM_G2$hypohyper %in% "hypo", ] %>% 
  group_by(brotherPairID, G1_trt, G2_trt, ID) %>% 
  dplyr::summarize(MeanBetaValue = mean(BetaValue, na.rm=TRUE)) %>% data.frame()

varPlot(form = MeanBetaValue~(G1_trt* G2_trt*brotherPairID), Data = PM_G2_mean_hypo, 
        MeanLine=list(var=c("G1_trt", "G2_trt"), 
                      col=c("white", "blue"), lwd=c(2,2)), 
        BG=list(var="G2_trt", col=paste0("gray", c(80, 90))),
        YLabel=list(cex = .8, text="Mean beta value at parDMS \n hypomethylated upon infection"))

myfitVCA_hypo <- fitVCA(form = MeanBetaValue~(G1_trt* G2_trt*brotherPairID), Data = PM_G2_mean_hypo) 

### Real values
trtEffect <- sum(myfitVCA_hypo$aov.tab[2:4, 5])
genEffect <- sum(myfitVCA_hypo$aov.tab[5:8, 5])
error <- sum(myfitVCA_hypo$aov.tab[9, 5])
realValHypoVCA <- data.frame(trtEffect=trtEffect, genEffect=genEffect,error=error)

### Randomisation
myrandomVCA <- function(df=PM_G2_mean_hypo){
  randomDF = df
  randomDF$G1_trt = sample(PM_G2_mean_hypo$G1_trt, replace = F)
  randomDF$G2_trt = sample(PM_G2_mean_hypo$G2_trt, replace = F)
  randomDF$brotherPairID = sample(PM_G2_mean_hypo$brotherPairID, replace = F)
  myfitVCA <- fitVCA(form = MeanBetaValue~(G1_trt* G2_trt*brotherPairID), Data = randomDF) 
  trtEffect <- sum(myfitVCA$aov.tab[2:4, 5])
  genEffect <- sum(myfitVCA$aov.tab[5:8, 5])
  error <- sum(myfitVCA$aov.tab[9, 5])
  return(data.frame(trtEffect=trtEffect, genEffect=genEffect,error=error))
}

randomHypoVCA = do.call(rbind, lapply(1:1000, function(x) {
  df=myrandomVCA(PM_G2_mean_hypo)
  df$rep=x
  return(df)}))

randomHypoVCA = melt(randomHypoVCA, id.vars = "rep")
# saveRDS(randomHypoVCA, file = "Rdata/randomHypoVCA.RDS")
randomHypoVCA <- readRDS(file = "Rdata/randomHypoVCA.RDS")
df2=reshape2::melt(realValHypoVCA)

sumDF <- randomHypoVCA %>% 
  group_by(variable) %>%
  dplyr::summarize(value = mean(value)) %>% data.frame()

ggplot(randomHypoVCA, aes(x=variable, y=value))+
  geom_boxplot()+
  geom_jitter(width=.1, alpha=.2)+
  geom_point(data = df2, col = "red", size = 6)+
  geom_text(data=sumDF, aes(label=round(value)), col="white")+
  geom_text(data = df2, aes(label=round(value)), col="white")+
  theme_cleveland()+
  ggtitle("VCA with bootstrap N=1000 at hypo-parDMS", subtitle = "red: observed values")

# estimate 95% confidence intervals, request CI for
# all variance components via 'VarVC=TRUE'
VCAinference(myfitVCA_hypo, VarVC=TRUE)

## Hyper
PM_G2_mean_hyper <- PM_G2[PM_G2$hypohyper %in% "hyper", ] %>% 
  group_by(brotherPairID, G1_trt, G2_trt, ID) %>% 
  dplyr::summarize(MeanBetaValue = mean(BetaValue, na.rm=TRUE)) %>% data.frame()

varPlot(form = MeanBetaValue~(G1_trt* G2_trt*brotherPairID), Data = PM_G2_mean_hyper, 
        MeanLine=list(var=c("G1_trt", "G2_trt"), 
                      col=c("white", "blue"), lwd=c(2,2)), 
        BG=list(var="G2_trt", col=paste0("gray", c(80, 90))),
        YLabel=list(cex = .8, text="Mean beta value at parDMS \n hypermethylated upon infection"))

myfitVCA_hyper <- fitVCA(form = MeanBetaValue~(G1_trt* G2_trt*brotherPairID), Data = PM_G2_mean_hyper) 

### Real values
trtEffect <- sum(myfitVCA_hyper$aov.tab[2:4, 5])
genEffect <- sum(myfitVCA_hyper$aov.tab[5:8, 5])
error <- sum(myfitVCA_hyper$aov.tab[9, 5])
realValHyperVCA <- data.frame(trtEffect=trtEffect, genEffect=genEffect,error=error)

### Randomisation
randomHyperVCA = do.call(rbind, lapply(1:1000, function(x) {
  df=myrandomVCA(PM_G2_mean_hyper)
  df$rep=x
  return(df)}))

randomHyperVCA = melt(randomHyperVCA, id.vars = "rep")
# saveRDS(randomHyperVCA, file = "Rdata/randomHyperVCA.RDS")
randomHyperVCA <- readRDS(file = "Rdata/randomHyperVCA.RDS")
df2=reshape2::melt(realValHyperVCA)

sumDF <- randomHyperVCA %>% 
  group_by(variable) %>%
  dplyr::summarize(value = mean(value)) %>% data.frame()

ggplot(randomHyperVCA, aes(x=variable, y=value))+
  geom_boxplot()+
  geom_jitter(width=.1, alpha=.2)+
  geom_point(data = df2, col = "red", size = 6)+
  geom_text(data=sumDF, aes(label=round(value)), col="white")+
  geom_text(data = df2, aes(label=round(value)), col="white")+
  theme_cleveland()+
  ggtitle("VCA with bootstrap N=1000 at hyper-parDMS", subtitle = "red: observed values")

# estimate 95% confidence intervals, request CI for
# all variance components via 'VarVC=TRUE'
VCAinference(myfitVCA_hypo, VarVC=TRUE)

################
## Hyper
PM_G2_mean_hyper <- PM_G2[PM_G2$hypohyper %in% "hyper", ] %>% 
  group_by(brotherPairID, G1_trt, G2_trt, ID) %>% 
  dplyr::summarize(MeanBetaValue = mean(BetaValue, na.rm=TRUE)) %>% data.frame()

varPlot(form = MeanBetaValue~(G1_trt* G2_trt*brotherPairID), Data = PM_G2_mean_hyper, 
        MeanLine=list(var=c("G1_trt", "G2_trt"), 
                      col=c("white", "blue"), lwd=c(2,2)), 
        BG=list(var="G2_trt", col=paste0("gray", c(80, 90))),
        YLabel=list(cex = .8, text="Mean beta value at parDMS \n hypermethylated upon infection"))

myfitVCA_hyper <- fitVCA(form = MeanBetaValue~(G1_trt* G2_trt*brotherPairID), Data = PM_G2_mean_hyper) 
print(myfitVCA_hyper, digits=4)
# estimate 95% confidence intervals, request CI for
# all variance components via 'VarVC=TRUE'
VCAinference(myfitVCA_hyper, VarVC=TRUE)

##############
## In parents
##############
parmod <- lmer(data = PM_G1, BetaValue ~ meth.diff.parentals : Treatment + (1|CpGSite) + (1|brotherPairID))

## check normality of residuals assumption
qqnorm(resid(parmod))
qqline(resid(parmod))

pred <- ggpredict(parmod, terms = c("meth.diff.parentals", "Treatment"))
plot(pred, add.data = T)+
  scale_color_manual(values = c("black", "red"))+
  scale_y_continuous(name = "Beta values")+
  scale_x_continuous(name = "Methylation difference between infected and control parents in percentage")+
  ggtitle("Predicted methylation ratio (Beta) values in parents\n as a function of differential methylation between exposed and control groups")+
  theme_bw()

##############
## Linear model: does the beta value of offspring at DMS depends on treatment Parent x Offspring?
##############
modFull <- lmer(BetaValue ~ (G1_trt * G2_trt):hypohyper + (1|CpGSite) + (1|Sex) + (1|brotherPairID),data = PM_G2, REML = F) # REML =F for model comparison
mod_noG1trt <- lmer(BetaValue ~ G2_trt:hypohyper + (1|CpGSite)+ (1|Sex) + (1|brotherPairID), data = PM_G2, REML = F)
mod_noG2trt <-lmer(BetaValue ~ G1_trt:hypohyper + (1|CpGSite) + (1|Sex) + (1|brotherPairID), data = PM_G2, REML = F)
mod_noInteractions <- lmer(BetaValue ~ (G1_trt + G2_trt):hypohyper + (1|CpGSite) + (1|Sex) + (1|brotherPairID), data = PM_G2, REML = F)
mod_noHypoHyper <- lmer(BetaValue ~ (G1_trt * G2_trt) + (1|CpGSite) + (1|Sex) + (1|brotherPairID), data = PM_G2, REML = F)

## check normality of residuals assumption
qqnorm(resid(modFull))
qqline(resid(modFull))

## Likelihood ratio tests for all variables:
lrtest(modFull, mod_noG1trt) # G1 trt is VERY VERY significant (LRT: χ² (4) = 1163.6, p < 0.001)
lrtest(modFull, mod_noG2trt) # G2 trt is VERY VERY significant (LRT: χ² (4) = 30.02, p < 0.001) NB that changed when brotherpair is used instead of family!
lrtest(modFull, mod_noInteractions) # interactions are significant (LRT: χ² (2) = 9.21, p < 0.01)
lrtest(modFull, mod_noHypoHyper) # hypo/hyper VERY VERY significant (LRT: χ² (4) = 1140, p < 0.001)

## Post-hoc tests between treatments
modFull <- lmer(BetaValue ~ (G1_trt * G2_trt):hypohyper + (1|CpGSite) + (1|Sex) + (1|brotherPairID),data = PM_G2)
modFull_emmeans <- emmeans(modFull, list(pairwise ~ (G1_trt:G2_trt):hypohyper), adjust = "tukey")
modFull_emmeans

P1 <- plot(modFull_emmeans, by = "hypohyper", comparisons = TRUE) +
  # coord_flip()+
  theme_bw() +
  ggtitle("Estimated marginal means of methylation ratio (beta)\n of offspring at parental DMS")+
  theme(legend.position = "none", axis.title.x = element_blank()) +
  scale_x_continuous("Beta value (methylation ratio)", limits = c(47,69.5))

## NB: Comparison arrows: https://cran.r-project.org/web/packages/emmeans/vignettes/xplanations.html
## two estimated marginal means (EMMs) differ significantly if, and only if, their respective comparison arrows do not overlap
## These comparison arrows are decidedly not the same as confidence intervals.
## Confidence intervals for EMMs are based on the statistical properties of the individual EMMs, whereas comparison arrows
## are based on the statistical properties of differences of EMMs.

## Add the PARENTAL DMS value
## Same test on ALL, G1 and G2 fish
modFullG1 <- lmer(BetaValue ~ G1_trt:hypohyper + (1|CpGSite) + (1|brotherPairID), data = PM_G1)

modFullG1_emmeans <- emmeans(modFullG1, list(pairwise ~ G1_trt:hypohyper), adjust = "tukey")
modFullG1_emmeans

P2 <- plot(modFullG1_emmeans, by = "hypohyper", comparisons = TRUE) +
  theme_bw() +
  ggtitle("Estimated marginal means of methylation ratio (beta)\n of parents at DMS")+
  theme(legend.position = "none", axis.title.x = element_blank()) +
  scale_x_continuous("Beta value (methylation ratio)", limits = c(47,69.5))

ggarrange(P2, P1, labels = c("A", "B"), ncol = 1, nrow = 2)

#####################################################################################
## Are the nbr of residuals methylation AT PARENTAL DMS different in the 4 G2 trt? ##
## (for hypo vs hypermeth)? ##
##############################

length(unique(PM_G1$CpGSite))# 3648 positions
PM_G1 %>% dplyr::count(ID)## NB: not all covered in all samples

length(unique(PM_G2$CpGSite[PM_G2$hypohyper %in% "hypo"]))# 1176 positions hypomethylated upon parental inf
length(unique(PM_G2$CpGSite[PM_G2$hypohyper %in% "hyper"]))# 2472 positions hypermethylated upon parental inf

myfun <- function(X){
  ## Calculate nbr of CpG hypo/hypermethylated per individual, and nbr of covered CpG:
  X <- X %>% group_by(ID, Treatment, brotherPairID, clutch.ID, Sex) %>%
    dplyr::summarise(ncov = n(),
                     hypoMeth = sum(BetaValue < 0.3),
                     hyperMeth = sum(BetaValue > 0.7)) %>% data.frame()
  # Calculate residuals of nbr of methCpG by nbr of covered CpG
  X$res_Nbr_methCpG_Nbr_coveredCpG_HYPO = residuals(lm(X$hypoMeth ~ X$ncov))
  X$res_Nbr_methCpG_Nbr_coveredCpG_HYPER = residuals(lm(X$hyperMeth ~ X$ncov))
  
  ## Statistical model: is it different in each offspring trt group?
  mod1 <- lmer(res_Nbr_methCpG_Nbr_coveredCpG_HYPO ~ Treatment + (1|brotherPairID/clutch.ID) + (1|Sex),
               data = X, REML = F)
  mod0 <- lmer(res_Nbr_methCpG_Nbr_coveredCpG_HYPO ~ 1 + (1|brotherPairID/clutch.ID) + (1|Sex),
               data = X, REML = F)
  print(lrtest(mod1, mod0))
  
  ## Post-hoc test:
  modhypo <- lmer(res_Nbr_methCpG_Nbr_coveredCpG_HYPO ~ Treatment + (1|brotherPairID/clutch.ID) + (1|Sex),
                  data = X)
  ## pairwise posthoc test
  print(emmeans(modhypo, list(pairwise ~ Treatment), adjust = "tukey"))
  
  mod3 <- lmer(res_Nbr_methCpG_Nbr_coveredCpG_HYPER ~ Treatment + (1|brotherPairID/clutch.ID) + (1|Sex),
               data = X, REML = F)
  mod4 <- lmer(res_Nbr_methCpG_Nbr_coveredCpG_HYPER ~ 1 + (1|brotherPairID/clutch.ID) + (1|Sex),
               data = X, REML = F)
  print(lrtest(mod3, mod4))
  
  ## Post-hoc test:
  modhyper <- lmer(res_Nbr_methCpG_Nbr_coveredCpG_HYPER ~ Treatment + (1|brotherPairID/clutch.ID) + (1|Sex),
                   data = X)
  ## pairwise posthoc test
  print(emmeans(modhyper, list(pairwise ~ Treatment), adjust = "tukey"))
  
  ## Plot
  phypo <- plot(ggpredict(modhypo, terms = c("Treatment")), add.data = TRUE)+
    scale_y_continuous("Residuals of number of hypomethylated methylated \ncytosines on number of cytosines covered") +
    ggtitle("Predicted residuals nbr of hypomethylated CpG")+
    theme_bw()
  
  phyper <- plot(ggpredict(modhyper, terms = c("Treatment")), add.data = TRUE)+
    scale_y_continuous("Residuals of number of hypermethylated methylated \n cytosines on number of cytosines covered") +
    ggtitle("Predicted residuals nbr of hypermethylated CpG")+
    theme_bw()
  return(list(phypo, phyper))
}

listplots <- myfun(X = PM_G2[PM_G2$hypohyper %in% "hypo",])
## NOT significant
annotate_figure(ggarrange(listplots[[1]], listplots[[2]],ncol = 2, nrow = 1),
                top = text_grob("Parental DMS hypomethylated upon infection, in offspring"))

listplots <- myfun(X = PM_G2[PM_G2$hypohyper %in% "hyper",])
## Treatment SIGNIFICANT in both excess hypo/hyper methylation **

# $`pairwise differences of Treatment`
## HYPO
# 1                       estimate   SE   df t.ratio p.value
# NE_control - E_control     23.71 7.28 10.3   3.257  0.0353
# NE_control - E_exposed     26.88 7.26 10.3   3.701  0.0172

## HYPER
# 1                       estimate   SE   df t.ratio p.value
# NE_control - E_control    -24.06 7.36 10.3  -3.269  0.0348
# NE_control - E_exposed    -27.07 7.34 10.3  -3.687  0.0177

annotate_figure(ggarrange(listplots[[1]], listplots[[2]],ncol = 2, nrow = 1),
                top = text_grob("Parental DMS hypermethylated upon infection, in offspring"))

## The beta values in parentalDMS in offspring follow the parental pattern hypo/hyper methylated upon infection

######################################################################################
## II. CORE DMS: Which of the parental DMS are also found in offspring comparisons? ##
######################################################################################
intersect(paste(DMSlist$DMS_15pc_G1_C_T$chr, DMSlist$DMS_15pc_G1_C_T$start),
          intersect(paste(DMSlist$DMS_15pc_CC_CT$chr, DMSlist$DMS_15pc_CC_CT$start),
                    paste(DMSlist$DMS_15pc_TC_TT$chr, DMSlist$DMS_15pc_TC_TT$start)))
## ONLY 4!!! "Gy_chrII 22196179"  "Gy_chrXII 10863858" "Gy_chrXVII 2658079" "Gy_chrXX 5344222" 

###############################################################
## CORE DMS: Which of the DMS are common to CC-CI and IC-II? ##
###############################################################
makeManP <- function(comp1, comp2){
  A <- methylKit::select(DMS1, which(paste(DMS1$chr, DMS1$start) %in% coreDMS))
  B <- as.data.frame(annotateWithGeneParts(as(A,"GRanges"),annotBed12)@members)
  A2 <- methylKit::select(DMS2, 
                          which(paste(DMS2$chr, DMS2$start) %in% coreDMS))
  B2 <- as.data.frame(annotateWithGeneParts(as(A2,"GRanges"),annotBed12)@members)
  
  ggarrange(
    makeManhattanPlots(DMSfile = DMS1, 
                       annotFile = as.data.frame(annotateWithGeneParts(as(DMS1,"GRanges"),annotBed12)@members),
                       GYgynogff = GYgynogff, mycols = c("red", "grey", "black", "green"), 
                       mytitle = paste0("Manhattan plot of ", comp1, " DMS")),
    makeManhattanPlots(DMSfile = DMS2, 
                       annotFile = as.data.frame(annotateWithGeneParts(as(DMS2,"GRanges"),annotBed12)@members),
                       GYgynogff = GYgynogff, mycols = c("red", "grey", "black", "green"), 
                       mytitle = paste0("Manhattan plot of ", comp2, " DMS")),
    makeManhattanPlots(DMSfile = A, annotFile = B, GYgynogff = GYgynogff, 
                       mycols = c("red", "grey", "black", "green"), mytitle = paste0("Manhattan plot core DMS in ", comp1)),
    makeManhattanPlots(DMSfile = A2, annotFile = B2, GYgynogff = GYgynogff, 
                       mycols = c("red", "grey", "black", "green"), mytitle = paste0("Manhattan plot core DMS in ", comp2)),
    labels = c("A", "B", "C", "D"), ncol = 1, nrow = 4, common.legend = T)
}

DMS1 = DMSlist$DMS_15pc_CC_CT # from: getDiffMeth(myuniteCov = uniteCov14_G2_woSexAndUnknowChr_G1CONTROL, myMetadata = fullMetadata_OFFS[fullMetadata_OFFS$trtG1G2_NUM %in% c(5,6),])
DMS2 = DMSlist$DMS_15pc_TC_TT # from: getDiffMeth(myuniteCov = uniteCov14_G2_woSexAndUnknowChr_G1INFECTED, myMetadata = fullMetadata_OFFS[fullMetadata_OFFS$trtG1G2_NUM %in% c(2,3),])
coreDMS <- intersect(paste(DMS1$chr, DMS1$start), paste(DMS2$chr, DMS2$start))

## Make Manhattan plot:
makeManP(comp1 = "CC-CI", comp2 = "IC-II")

## Annotation of the core DMS:
coreDMSmethylDiff <- methylKit::select(DMS1, which(paste(DMS1$chr, DMS1$start) %in% coreDMS))

## Differentially methylated sites:
subGOterms = getAnnotationFun(METHOBJ = coreDMSmethylDiff)

## Background annotations:
universeGOterms = getAnnotationFun(METHOBJ = uniteCov14_G2_woSexAndUnknowChrOVERLAP)

length(universeGOterms)# 16024


# as.vector(lapply(strsplit(as.character(TSSAssociation_DiffMeth15p$feature.name), "\\."), "[", 1))

# Using the genes with associated pop-DMS, we applied
# a conditional hypergeometric Gene Ontology (GO) term enrichment
# analysis (false discovery rate–corrected P ≤ 0.05) with the Ensembl
# stickleback annotation dataset “gaculeatus_gene_ensembl,” and all
# genes that were associated to any sequenced CpG site were used as
# universe. We identified overrepresented biological processes, molec-
#   ular functions, and cellular components using the packages GOstats
# version 2.5 (59) and GSEABase version 1.46 (60) and corrected for
# multiple testing using the false discovery rate method implemented
# in goEnrichment version 1.0 (61) in R version 3.6 (52). Figures were
# produced using ggplot2 version 3.2 (56)

#######################################################################
## TRANSMITTED DMS: Which of the DMS are found in CCvsIC and CIvsII? ##
#######################################################################
makeManP(DMS1 = DMS15pc_G1_controlG2_half, comp1 = "CC-IC",
         DMS2 = DMS15pc_G1_infectedG2_half, comp2 = "CI-II")

##############
## Plot mean Beta values of offsprings at parental DMS, per trt, along the chromosomes
##############
meanBeta_G2_simple <- PM_G2 %>% group_by(Chr, Pos, Treatment) %>%
  dplyr::summarize(Mean = mean(BetaValue, na.rm=TRUE))
names(meanBeta_G2_simple) <- c("Chr", "Pos", "Treatment_G2", "MeanBetaG2")

genome <- GYgynogff %>%
  mutate(chrom_nr=chrom %>% deroman(), chrom_order=factor(chrom_nr) %>% as.numeric()) %>% 
  arrange(chrom_order) %>%
  mutate(gstart=lag(length,default=0) %>% cumsum(), gend=gstart+length, 
         type=LETTERS[2-(chrom_order%%2)],   gmid=(gstart+gend)/2)

mydata = tibble(trt=meanBeta_G2_simple$Treatment_G2,
                chrom=meanBeta_G2_simple$Chr, pos=meanBeta_G2_simple$Pos, beta=meanBeta_G2_simple$MeanBetaG2)
mydata$pos <- as.numeric(mydata$pos)
mydata$pos <- as.numeric(mydata$pos)

table(mydata$chrom)## check that chrXIX and chrUN are well removed!!

# join DMS and genomic position
data = dplyr::left_join(mydata, genome) %>% dplyr::mutate(gpos=pos+gstart)

# plot:
ggplot()+
  geom_rect(data=genome,aes(xmin=gstart,xmax=gend,ymin=-Inf,ymax=Inf,fill=type), alpha=.2)+
  geom_point(data=data, aes(x=gpos,y=beta, shape=trt, col=trt),fill="white", size = 1)+
  scale_color_manual(values = colOffs)+
  scale_shape_manual(values=c(21,21,21,21))+
  scale_fill_manual(values=c(A=rgb(.9,.9,.9),B=NA),guide="none")+
  scale_x_continuous(breaks=genome$gmid,labels=genome$chrom %>% str_remove(.,"Gy_chr"),
                     position = "top",expand = c(0,0))+
  theme_minimal()+
  theme(panel.grid = element_blank(),
        axis.line=element_blank(),
        axis.title = element_blank(),
        strip.placement = "outside")+
  facet_grid(trt~.)+
  ggtitle("Mean methylation proportions at the 3648 parental DMS for each offspring group")

###########################################################################################
## Not conclusive: test hyp that beta value is different (higher?) in the center of the chromosome, 
## where there are less recombinations. Test for each chromosome
meanBeta_G2_extended <- PM_G2 %>% group_by(Chr, Pos, Treatment, Sex, brotherPairID) %>%
  dplyr::summarize(Mean = mean(BetaValue, na.rm=TRUE))
names(meanBeta_G2_extended) <- c("Chr", "Pos", "Treatment_G2","Sex", "brotherPairID", "MeanBetaG2")

mydata = tibble(trt=meanBeta_G2_extended$Treatment_G2, sex = meanBeta_G2_extended$Sex, brotherPairID = meanBeta_G2_extended$brotherPairID,
                chrom=meanBeta_G2_extended$Chr, pos=meanBeta_G2_extended$Pos, beta=meanBeta_G2_extended$MeanBetaG2)
mydata$pos <- as.numeric(mydata$pos)

# join DMS and genomic position
data = dplyr::left_join(mydata, genome) %>% dplyr::mutate(gpos=pos+gstart)

## Add distance to center
data$dist2mid <- abs(data$gmid - data$gpos)

mod <- lmer(beta ~ dist2mid:chrom + (1|trt) + (1|sex) + (1|brotherPairID), data = data, REML = F)
mod0 <- lmer(beta ~ dist2mid + (1|trt) + (1|sex) + (1|brotherPairID), data = data, REML = F)
mod00 <- lmer(beta ~ chrom + (1|trt) + (1|sex) + (1|brotherPairID), data = data, REML = F)

lrtest(mod, mod0) # chromosome matters
lrtest(mod0, mod00) # distance to middle matters

## check normality of residuals assumption
qqnorm(resid(mod))
qqline(resid(mod)) # quite skewed

pred <- ggpredict(mod, terms = c("dist2mid","chrom"))
plot(pred) +
  scale_color_manual(values = 1:20)